library(tidyverse)
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
all_counts <- readRDS(file= "Data/pc_count_matrix.rds")
avg_counts <- readRDS(file="Data/avg_pc_count_matrix.rds")
diff_counts <- readRDS(file="Data/diff_pc_count_matrix.rds")
meta <- suppressMessages(readr::read_tsv("Data/complete_meta_data.tsv"))
tissue_types <- unique(meta %>% pull(tissue_type))
TFs<- c("Ascl1", "Hes1", "Neurod1", "Mecp2", "Mef2c", "Runx1", "Tcf4", "Pax6")

Investigate the most highly expressed gene (sum of all stages) for each tissue type

maxes_list <- list()
for (tissue in tissue_types){
  tissue_subset_ids <- meta %>% filter(tissue_type == tissue) %>% pull(id)
  all_sums <- apply(all_counts[,tissue_subset_ids],1, sum)
  maxes_list[tissue] <- names(all_sums)[which.max(all_sums)]
}
for (name in names(maxes_list)){
  print(paste0("The most highly expressed protein in ", name, " : ", maxes_list[name]))
}
## [1] "The most highly expressed protein in forebrain : Tuba1a"
## [1] "The most highly expressed protein in midbrain : Tuba1a"
## [1] "The most highly expressed protein in hindbrain : Tuba1a"
## [1] "The most highly expressed protein in embryonic facial prominence : Hba-a2"
## [1] "The most highly expressed protein in heart : Hba-a2"
## [1] "The most highly expressed protein in limb : Hba-a2"
## [1] "The most highly expressed protein in liver : Hba-a2"
## [1] "The most highly expressed protein in neural tube : Tuba1a"
## [1] "The most highly expressed protein in lung : Sftpc"
## [1] "The most highly expressed protein in stomach : Cym"
## [1] "The most highly expressed protein in intestine : mt-Atp6"
## [1] "The most highly expressed protein in kidney : mt-Atp6"
## [1] "The most highly expressed protein in thymus : Actb"
## [1] "The most highly expressed protein in skeletal muscle tissue : Acta1"
## [1] "The most highly expressed protein in spleen : Hba-a2"
## [1] "The most highly expressed protein in urinary bladder : Actg2"
## [1] "The most highly expressed protein in adrenal gland : mt-Atp6"

Same thing as above but excluding the mitochondria

maxes_list <- list()
for (tissue in tissue_types){
  tissue_subset_ids <- meta %>% filter(tissue_type == tissue) %>% pull(id)
  all_sums <- apply(all_counts[,tissue_subset_ids],1, sum)
  all_sums <- all_sums[!sapply(names(all_sums), startsWith, "mt-")]
  maxes_list[tissue] <- names(all_sums)[which.max(all_sums)]
}
for (name in names(maxes_list)){
  print(paste0("The most highly expressed protein in ", name, " : ", maxes_list[name]))
}
## [1] "The most highly expressed protein in forebrain : Tuba1a"
## [1] "The most highly expressed protein in midbrain : Tuba1a"
## [1] "The most highly expressed protein in hindbrain : Tuba1a"
## [1] "The most highly expressed protein in embryonic facial prominence : Hba-a2"
## [1] "The most highly expressed protein in heart : Hba-a2"
## [1] "The most highly expressed protein in limb : Hba-a2"
## [1] "The most highly expressed protein in liver : Hba-a2"
## [1] "The most highly expressed protein in neural tube : Tuba1a"
## [1] "The most highly expressed protein in lung : Sftpc"
## [1] "The most highly expressed protein in stomach : Cym"
## [1] "The most highly expressed protein in intestine : Apoa1"
## [1] "The most highly expressed protein in kidney : Hba-a2"
## [1] "The most highly expressed protein in thymus : Actb"
## [1] "The most highly expressed protein in skeletal muscle tissue : Acta1"
## [1] "The most highly expressed protein in spleen : Hba-a2"
## [1] "The most highly expressed protein in urinary bladder : Actg2"
## [1] "The most highly expressed protein in adrenal gland : Hba-a2"

Top 5 Universally highly expressed protein across all tissues regardless of stages (without processing)

##Collapse dev_stages
collapsed_matrices <- matrix(data=NA, nrow = nrow(all_counts), ncol = length(tissue_types))
rownames(collapsed_matrices) <- rownames(all_counts)
walker <- 1
for (tissue in tissue_types){
  tissue_subset_ids <- meta %>% filter(tissue_type == tissue) %>% pull(id)
  all_avgs <- apply(all_counts[,tissue_subset_ids],1, mean)
  collapsed_matrices[,walker] <- all_avgs
  walker <- walker + 1
}
all_sums <- apply(collapsed_matrices,1, sum)
for (i in 1:10){
  print(paste0("The highests expressed protein acrossed all tissue in order ",i ,": ", names(sort(all_sums,decreasing=TRUE))[i]))
}
## [1] "The highests expressed protein acrossed all tissue in order 1: Hba-a2"
## [1] "The highests expressed protein acrossed all tissue in order 2: Hbb-bs"
## [1] "The highests expressed protein acrossed all tissue in order 3: mt-Atp6"
## [1] "The highests expressed protein acrossed all tissue in order 4: Hba-a1"
## [1] "The highests expressed protein acrossed all tissue in order 5: Hbb-bt"
## [1] "The highests expressed protein acrossed all tissue in order 6: mt-Atp8"
## [1] "The highests expressed protein acrossed all tissue in order 7: Actb"
## [1] "The highests expressed protein acrossed all tissue in order 8: Ftl1"
## [1] "The highests expressed protein acrossed all tissue in order 9: Ppia"
## [1] "The highests expressed protein acrossed all tissue in order 10: Acta1"

Expression profile of the highest expressed protein across all tissues Hba-a2 Extract only Hba-a2 information

library(ggplot2)
target_protein <- "Hba-a2"
unique_dev_stages <- unique(meta %>% pull(dev_stage))
Hba_matrix <- matrix(data=NA, nrow=length(tissue_types), ncol=3)
colnames(Hba_matrix) <- c("count", "dev_stage", "tissue_type")
Hba_frame <- data.frame(Hba_matrix)
walker <- 1
for (exp_id in names(avg_counts[target_protein,])){
    Hba_frame[walker, "count"] <- avg_counts[target_protein, exp_id]
    Hba_frame[walker, "dev_stage"] <- meta %>% filter(id==exp_id) %>% pull(dev_stage)
    Hba_frame[walker, "tissue_type"] <- meta %>% filter(id==exp_id) %>% pull(tissue_type)
    walker <- walker + 1
}
Hba_frame$count <- log2(Hba_frame$count) 

Plot the extracted information

library(ggrepel)
make_label <- function(row){
  if (row['dev_stage'] == max(Hba_frame %>% filter(tissue_type==row['tissue_type']) %>% pull(dev_stage))){
    #print(max(Hba_frame %>% filter(tissue_type==tissue) %>% pull(dev_stage)))\
     return(as.character(row['tissue_type']))
  }else {

    return(NA_character_)
  }
}
Hba_frame[,"label"] <- Hba_frame %>% apply(1,make_label)
# Hba_frame <- Hba_frame %>%
#   mutate(label = if_else(dev_stage == max(Hba_frame %>% filter()), as.character(tissue_type), NA_character_))
ggplot(Hba_frame, aes(x=dev_stage, y=count, group=tissue_type, colour=tissue_type)) + geom_line() + geom_point() + expand_limits(x= c(4, 10)) +
  # geom_text_repel(
  #   aes(label = gsub("^.*$", " ", label)),
  #   # This will force the correct position of the link's right end.   

  #   segment.curvature = -0.1,
  #   segment.square = TRUE,
  #   segment.color = 'grey',
  #   box.padding = 0.1,
  #   point.padding = 0.6,
  #   nudge_x = 1,
  #   nudge_y = 1,
  #   force = 15,
  #   hjust = 0,
  #   direction = "y",
  #   na.rm = TRUE,
  #   xlim = c(5, 10),
  #   ylim = c(0, 150000),
  # ) +
  geom_text_repel(data = . %>% filter(!is.na(label)),
                  aes(label = paste0("  ", label)),
                  #segment.alpha = 0, ## This will 'hide' the link
                  segment.curvature = -0.1,
                  segment.square = TRUE,
                  # segment.color = 'grey',
                  box.padding = 0.1,
                  point.padding = 0.6,
                  nudge_x = 1,
                  nudge_y = 1,
                 force = 0.5,
                  hjust = 0,
                  direction="y",
                  na.rm = TRUE, 
                  xlim = c(5, 10),
                  ylim = c(10,17),
)                   +
  ylab("counts(log2)") + ggtitle("Expression pattern of Hba-a2 across tissues") + theme(plot.title = element_text(hjust = 0.5))

Top 5 Universally highly expressed protein across all tissues regardless of stages (by percentage in tissues)

##Collapse dev_stages
collapsed_matrices <- matrix(data=NA, nrow = nrow(all_counts), ncol = length(tissue_types))
rownames(collapsed_matrices) <- rownames(all_counts)
walker <- 1
for (tissue in tissue_types){
  tissue_subset_ids <- meta %>% filter(tissue_type == tissue) %>% pull(id)
  all_avgs <- apply(all_counts[,tissue_subset_ids],1, mean)
  percent_counts <- all_avgs/sum(all_avgs)
  collapsed_matrices[,walker] <- percent_counts
  walker <- walker + 1
}
all_sums <- apply(collapsed_matrices,1, sum)
for (i in 1:10){
  print(paste0("The highests expressed protein acrossed all tissue in order ",i ,": ", names(sort(all_sums,decreasing=TRUE))[i]))
}
## [1] "The highests expressed protein acrossed all tissue in order 1: Hba-a2"
## [1] "The highests expressed protein acrossed all tissue in order 2: Hbb-bs"
## [1] "The highests expressed protein acrossed all tissue in order 3: mt-Atp6"
## [1] "The highests expressed protein acrossed all tissue in order 4: Hba-a1"
## [1] "The highests expressed protein acrossed all tissue in order 5: mt-Atp8"
## [1] "The highests expressed protein acrossed all tissue in order 6: Hbb-bt"
## [1] "The highests expressed protein acrossed all tissue in order 7: Actb"
## [1] "The highests expressed protein acrossed all tissue in order 8: Ftl1"
## [1] "The highests expressed protein acrossed all tissue in order 9: Ppia"
## [1] "The highests expressed protein acrossed all tissue in order 10: Acta1"

Top 5 Universally highly expressed protein across all tissues regardless of stages (by ranking)

##Collapse dev_stages
collapsed_matrices <- matrix(data=NA, nrow = nrow(all_counts), ncol = length(tissue_types))
rownames(collapsed_matrices) <- rownames(all_counts)
walker <- 1
for (tissue in tissue_types){
  tissue_subset_ids <- meta %>% filter(tissue_type == tissue) %>% pull(id)
  all_avgs <- apply(all_counts[,tissue_subset_ids],1, mean)
  all_ranking <- rank(all_avgs)
  collapsed_matrices[,walker] <- all_ranking
  walker <- walker + 1
}
all_sums <- apply(collapsed_matrices,1, sum)
for (i in 1:10){
  print(paste0("The highests expressed protein acrossed all tissue in order ",i ,": ", names(sort(all_sums, decreasing = TRUE))[i]))
}
## [1] "The highests expressed protein acrossed all tissue in order 1: mt-Atp6"
## [1] "The highests expressed protein acrossed all tissue in order 2: mt-Atp8"
## [1] "The highests expressed protein acrossed all tissue in order 3: Hba-a2"
## [1] "The highests expressed protein acrossed all tissue in order 4: Actb"
## [1] "The highests expressed protein acrossed all tissue in order 5: Ppia"
## [1] "The highests expressed protein acrossed all tissue in order 6: Ftl1"
## [1] "The highests expressed protein acrossed all tissue in order 7: mt-Nd1"
## [1] "The highests expressed protein acrossed all tissue in order 8: Eef1a1"
## [1] "The highests expressed protein acrossed all tissue in order 9: mt-Cytb"
## [1] "The highests expressed protein acrossed all tissue in order 10: mt-Co1"

expression distribution acrossed different tissues (box plot)

library(cowplot)
all_counts_melted <- data.frame(all_counts) %>% gather("id", "count")
all_counts_melted <- all_counts_melted %>% inner_join((meta %>% select(id, tissue_type, dev_stage)), by="id")
  ggplot(data=all_counts_melted, aes(x=dev_stage, y=log2(count+1))) + geom_boxplot() +  facet_wrap(~ tissue_type, ncol=3) + theme_cowplot(12) + geom_hline(yintercept=median(log2(all_counts+1)), colour="red")

expression distribution acrossed different tissues (histogram)

all_counts_melted <- data.frame(all_counts) %>% gather("id", "count")
all_counts_melted <- all_counts_melted %>% inner_join((meta %>% select(id, tissue_type, dev_stage)), by="id")
  suppressWarnings(ggplot(data=all_counts_melted, aes(x=log2(count+1), fill=dev_stage)) + geom_histogram(binwidth = 0.5) +  facet_wrap(~ tissue_type, ncol=3) + theme_cowplot(12))

expression distribution acrossed different tissues (histogram log transformed)

all_counts_melted <- data.frame(all_counts) %>% gather("id", "count")
all_counts_melted <- all_counts_melted %>% inner_join((meta %>% select(id, tissue_type, dev_stage)), by="id")
  suppressWarnings(ggplot(data=all_counts_melted, aes(x=log2(count+1), y=..density.. ,fill=dev_stage)) + geom_histogram(binwidth = 0.5) +  facet_wrap(~ tissue_type, ncol=3) + theme_cowplot(12))

expression distribution acrossed different tissues (box plot collapsed dev_stages)

all_counts_melted <- data.frame(all_counts) %>% gather("id", "count")
all_counts_melted <- all_counts_melted %>% inner_join((meta %>% select(id, tissue_type, dev_stage)), by="id")
  ggplot(data=all_counts_melted, aes(y=log2(count+1))) + geom_boxplot() +  facet_grid(. ~ tissue_type) + theme_cowplot(12) + theme(
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) + xlab("tissue_types") + geom_hline(yintercept=median(log2(all_counts+1)), colour="red")

expression distribution acrossed different tissues (histogram collapsed by dev_stage)

all_counts_melted <- data.frame(all_counts) %>% gather("id", "count")
all_counts_melted <- all_counts_melted %>% inner_join((meta %>% select(id, tissue_type, dev_stage)), by="id")
  suppressWarnings(ggplot(data=all_counts_melted, aes(x=log2(count+1), fill=tissue_type, y=..density..)) + geom_histogram(binwidth = 0.5) + theme_cowplot(12))  

expression distribution acrossed different tissues (freq_plot collapsed by dev_stage)

all_counts_melted <- data.frame(all_counts) %>% gather("id", "count")
all_counts_melted <- all_counts_melted %>% inner_join((meta %>% select(id, tissue_type, dev_stage)), by="id")
  ggplot(data=all_counts_melted, aes(x=log2(count+1), y=..density.., colour=tissue_type)) + geom_freqpoly(binwidth=0.5) + theme_cowplot(12) + scale_x_continuous(limits=c(0, 20))

Mean-variance

library(ggrepel)
all_counts_mean_var <- all_counts
all_counts_mean_var <- cbind(all_counts_mean_var, mean=apply(all_counts,1,mean))
all_counts_mean_var <- cbind(all_counts_mean_var, variance=apply(all_counts,1,var))
ggplot(data=data.frame(all_counts_mean_var), aes(x=log2(mean+1), y=log2(variance+1))) + geom_point() + theme_cowplot(12) + 
  geom_text_repel(data=data.frame(all_counts_mean_var) %>% rownames_to_column(var="rowname") %>% slice_max(variance, n=10), aes(x=log2(mean+1), y=log2(variance), label=rowname, colour="red")) +
  geom_text_repel(data=data.frame(all_counts_mean_var) %>% rownames_to_column(var="rowname") %>% slice_max(mean, n=10), aes(x=log2(mean+1), y=log2(variance), label=rowname, colour="blue")) + scale_colour_discrete(name = "Top elements", labels = c("Variance", "Mean"))

Mean and variance all together but hue by organism type

collapsed_matrices <- data.frame(mean=double(), variance=double(), tissue=character(), row=character())
colnames(collapsed_matrices) <- c("mean", "variance", "tissue")
for (tissue in tissue_types){
  tissue_subset_ids <- meta %>% filter(tissue_type == tissue) %>% pull(id)
  all_means <- apply(all_counts[,tissue_subset_ids],1, mean)
  all_variance <- apply(all_counts[,tissue_subset_ids],1, var)
  temp_matrix <- data.frame(mean=all_means, variance=all_variance, tissue) %>% rownames_to_column("rowname")
  collapsed_matrices <- rbind(collapsed_matrices, temp_matrix)
}
ggplot(data=collapsed_matrices, aes(x=log2(mean+1), y=log2(variance+1), colour=tissue)) + geom_point()+ theme_cowplot(12) + 
  geom_text_repel(data=collapsed_matrices %>% slice_max(variance+mean, n=15), aes(x=log2(mean+1), y=log2(variance+1), label=rowname, colour=tissue)) 

  #geom_text_repel(data=collapsed_matrices %>% rownames_to_column("row") %>% slice_max(mean, n=10), aes(x=log2(mean+1), y=log2(variance+1), label=row, colour=tissue))
  
  #scale_colour_discrete(name = "Top elements", labels = c("Variance", "Mean"))

Variability across tissue types

ggplot(data=collapsed_matrices, aes(x=log2(mean+1), y=log2(variance+1))) + geom_point() +  facet_wrap(~ tissue, ncol=3) + theme_cowplot(12) 

Zooming in on highly epxressed gene (>1025)

ggplot(data=collapsed_matrices  %>% filter(log2(mean+1) >=10), aes(x=log2(mean+1), y=log2(variance+1))) + geom_point() +  facet_wrap(~ tissue, ncol=3) + theme_cowplot(12) 

Distribution of the TFs

TFCounts <- data.frame(all_counts[TFs,])
TFCounts <- TFCounts %>% rownames_to_column("rowname")
TFCounts <- TFCounts %>% gather("id", "counts", -rowname)
TFCounts <- TFCounts %>% left_join((meta %>% select(tissue_type, dev_stage, id)), by="id")

ggplot(data=TFCounts, aes(x=counts)) + geom_histogram(bins = 100) + facet_grid(~  rowname) + theme_cowplot(12)

Distribution of the TFs (Log transformed)

TFCounts <- data.frame(all_counts[TFs,])
TFCounts <- TFCounts %>% rownames_to_column("rowname")
TFCounts <- TFCounts %>% gather("id", "counts", -rowname)
TFCounts <- TFCounts %>% left_join((meta %>% select(tissue_type, dev_stage, id)), by="id")

ggplot(data=TFCounts, aes(x=log2(counts+1))) + geom_histogram(bins = 100) + facet_grid(~  rowname) + theme_cowplot(12)

Distribution of the TFs (Log transformed)

TFCounts <- data.frame(all_counts[TFs,])
TFCounts <- TFCounts %>% rownames_to_column("rowname")
TFCounts <- TFCounts %>% gather("id", "counts", -rowname)
TFCounts <- TFCounts %>% left_join((meta %>% select(tissue_type, dev_stage, id)), by="id")

ggplot(data=collapsed_matrices %>% filter(rowname %in% TFs), aes(x=log2(mean+1), y=log2(variance+1), colour=tissue)) + geom_point()+ theme_cowplot(12) + 
  geom_text_repel(data=collapsed_matrices %>% filter(rowname %in% TFs) %>% slice_max(variance+mean, n=15), aes(x=log2(mean+1), y=log2(variance+1), label=rowname, colour=tissue)) 

look at 10 genese Completely different expression pattern in different tissues

variability (across tissues, within tissues as well) mean-variance is the ranking similar across tissues

##Remove zero variance 
all_counts_drop_var <- (t(all_counts))[, which(apply(t(all_counts),2,var) !=0)]
all_counts.pca <- prcomp(all_counts_drop_var, center = TRUE,scale. = TRUE)
library(ggfortify)
autoplot(all_counts.pca, data=meta, colour="dev_stage") + theme_cowplot(12)